"""
================================================================================
REPLICATION CODE FOR LEGISLATIVE DURATION ANALYSIS
================================================================================

This script reproduces all tables and figures for the analysis of legislative 
duration in the European Union. 

Author: [Research Team]
Date: 2026
Data: analysis_data.xlsx

REQUIRED PACKAGES:
- pandas
- numpy
- matplotlib
- seaborn
- statsmodels
- scipy

USAGE:
    python replication_script.py

OUTPUT:
    - Table 1: Descriptive Statistics (CSV)
    - Table 2: Baseline Model (CSV)
    - Table 3: Hypothesis Testing Results (CSV)
    - Table A3: Correlation Matrix (CSV)
    - Table A4: Alternative Model Specifications (CSV)
    - Figure 2: Distribution of Legislative Duration (PNG)
    - Figure 3: Interaction Effect - H1 (PNG)
    - Figure 4: Interaction Effect - H2 (PNG)
    - Figure 5: Interaction Effect - H3 (PNG)
    - Figure A1: Coefficient Comparison (PNG)
    - Figure A2: R-squared Comparison (PNG)
    - Figure A3: Coefficient Plot with CI (PNG)
    - Figure A4: Significance Heatmap (PNG)

================================================================================
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import glm
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 11
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 13
plt.rcParams['legend.fontsize'] = 10


# ================================================================================
# DATA LOADING
# ================================================================================

def load_data(filepath='analysis_data.xlsx'):
    """Load the dataset"""
    df = pd.read_excel(filepath)
    df.columns = df.columns.str.strip()
    return df


# ================================================================================
# TABLE 1: DESCRIPTIVE STATISTICS
# ================================================================================

def generate_table1(df, output_path='table1_descriptive_statistics.csv'):
    """
    Generate Table 1: Descriptive Statistics
    
    Variables:
    - duration1: Legislative duration (dependent variable)
    - intraconflict: Intra-institutional conflicts
    - hetero_int: Interest groups heterogeneity
    - aitems: Member state homogeneity
    - bitems: Member state heterogeneity
    - int_sup_per: Interest groups support
    - int_opp_per: Interest groups opposition
    - rich_opp_per: Resource-rich groups opposition
    - poor_oop_per: Resource-poor groups opposition
    - interconflict: Inter-institutional conflicts
    - epconflict: Conflicts within the EP
    - saliency: Legislative saliency
    - con_duration: Consultation duration
    - complexity: Legislative complexity
    - turnover: Institutional turnover
    - covid: COVID-19 dummy
    - presidency: Council presidency
    """
    
    variables = ['duration1', 'duration2', 'intraconflict', 'hetero_int', 'aitems', 
                 'bitems', 'int_sup_per', 'int_opp_per', 'rich_opp_per', 'poor_oop_per',
                 'interconflict', 'epconflict', 'saliency', 'con_duration', 
                 'complexity', 'turnover', 'covid', 'presidency']
    
    desc_stats = df[variables].describe()
    desc_stats.loc['N'] = df[variables].count()
    desc_stats.loc['missing'] = df[variables].isnull().sum()
    
    desc_stats.to_csv(output_path)
    print(f"Table 1 saved to {output_path}")
    return desc_stats


# ================================================================================
# TABLE 2: BASELINE MODEL
# ================================================================================

def fit_quasipoisson(formula, data):
    """
    Fit Poisson model with quasi-likelihood dispersion adjustment
    Similar to R's quasipoisson family
    """
    model = glm(formula, data=data, family=sm.families.Poisson()).fit()
    
    n = data.shape[0]
    p = model.df_model + 1
    dispersion = model.pearson_chi2 / (n - p)
    
    model.dispersion = dispersion
    model.bse_adj = model.bse * np.sqrt(dispersion)
    model.tvalues_adj = model.params / model.bse_adj
    model.pvalues_adj = 2 * (1 - stats.t.cdf(np.abs(model.tvalues_adj), n - p))
    
    return model


def generate_table2(df, output_path='table2_baseline_model.csv'):
    """
    Generate Table 2: Baseline Model Results
    """
    formula = ('duration1 ~ intraconflict + hetero_int + aitems + bitems + '
               'int_sup_per + rich_opp_per + poor_oop_per + interconflict + '
               'epconflict + saliency + con_duration + complexity + turnover + '
               'covid + presidency')
    
    model = fit_quasipoisson(formula, df)
    
    results = []
    for var in model.params.index:
        coef = model.params[var]
        se = model.bse_adj[var]
        z = model.tvalues_adj[var]
        p = model.pvalues_adj[var]
        
        results.append({
            'Variable': var,
            'Coefficient': coef,
            'Std. Error': se,
            'z-value': z,
            'P-value': p
        })
    
    result_df = pd.DataFrame(results)
    result_df.to_csv(output_path, index=False)
    print(f"Table 2 saved to {output_path}")
    return result_df


# ================================================================================
# TABLE 3: HYPOTHESIS TESTING
# ================================================================================

def generate_table3(df, output_path='table3_hypothesis_testing.csv'):
    """
    Generate Table 3: Hypothesis Testing Results
    
    Models:
    - Model 1 (H1): Baseline with interaction term intraconflict * hetero_int
    - Model 2a (H2): With interaction aitems * int_sup_per
    - Model 2b (H2): With both aitems and bitems interactions
    - Model 3a (H3): With interaction bitems * rich_opp_per
    - Model 3b (H3): With interaction bitems * poor_oop_per
    """
    
    results = []
    
    # Model 1 (H1): intraconflict * hetero_int
    formula1 = ('duration1 ~ intraconflict * hetero_int + aitems + bitems + '
                'int_sup_per + rich_opp_per + poor_oop_per + interconflict + '
                'epconflict + saliency + con_duration + complexity + turnover + '
                'covid + presidency')
    model1 = glm(formula1, data=df, family=sm.families.Poisson()).fit()
    
    for var in model1.params.index:
        results.append({
            'Model': 'Model 1 (H1)',
            'Variable': var,
            'Coefficient': model1.params[var],
            'Std. Error': model1.bse[var],
            'z-value': model1.tvalues[var],
            'P-value': model1.pvalues[var]
        })
    
    # Model 2a (H2): aitems * int_sup_per
    formula2a = ('duration1 ~ intraconflict + hetero_int + aitems * int_sup_per + '
                 'bitems + rich_opp_per + poor_oop_per + interconflict + '
                 'epconflict + saliency + con_duration + complexity + turnover + '
                 'covid + presidency')
    model2a = glm(formula2a, data=df, family=sm.families.Poisson()).fit()
    
    for var in model2a.params.index:
        results.append({
            'Model': 'Model 2a (H2)',
            'Variable': var,
            'Coefficient': model2a.params[var],
            'Std. Error': model2a.bse[var],
            'z-value': model2a.tvalues[var],
            'P-value': model2a.pvalues[var]
        })
    
    # Model 2b (H2): aitems * int_sup_per + bitems * int_sup_per
    formula2b = ('duration1 ~ intraconflict + hetero_int + aitems + bitems * int_sup_per + '
                 'int_sup_per + rich_opp_per + poor_oop_per + interconflict + '
                 'epconflict + saliency + con_duration + complexity + turnover + '
                 'covid + presidency')
    model2b = glm(formula2b, data=df, family=sm.families.Poisson()).fit()
    
    for var in model2b.params.index:
        results.append({
            'Model': 'Model 2b (H2)',
            'Variable': var,
            'Coefficient': model2b.params[var],
            'Std. Error': model2b.bse[var],
            'z-value': model2b.tvalues[var],
            'P-value': model2b.pvalues[var]
        })
    
    # Model 3a (H3): bitems * rich_opp_per
    formula3a = ('duration1 ~ intraconflict + hetero_int + aitems + bitems * rich_opp_per + '
                 'int_sup_per + poor_oop_per + interconflict + '
                 'epconflict + saliency + con_duration + complexity + turnover + '
                 'covid + presidency')
    model3a = glm(formula3a, data=df, family=sm.families.Poisson()).fit()
    
    for var in model3a.params.index:
        results.append({
            'Model': 'Model 3a (H3)',
            'Variable': var,
            'Coefficient': model3a.params[var],
            'Std. Error': model3a.bse[var],
            'z-value': model3a.tvalues[var],
            'P-value': model3a.pvalues[var]
        })
    
    # Model 3b (H3): bitems * poor_oop_per
    formula3b = ('duration1 ~ intraconflict + hetero_int + aitems + bitems + '
                 'int_sup_per + rich_opp_per + poor_oop_per * bitems + interconflict + '
                 'epconflict + saliency + con_duration + complexity + turnover + '
                 'covid + presidency')
    model3b = glm(formula3b, data=df, family=sm.families.Poisson()).fit()
    
    for var in model3b.params.index:
        results.append({
            'Model': 'Model 3b (H3)',
            'Variable': var,
            'Coefficient': model3b.params[var],
            'Std. Error': model3b.bse[var],
            'z-value': model3b.tvalues[var],
            'P-value': model3b.pvalues[var]
        })
    
    result_df = pd.DataFrame(results)
    result_df.to_csv(output_path, index=False)
    print(f"Table 3 saved to {output_path}")
    return result_df


# ================================================================================
# TABLE A3: CORRELATION MATRIX
# ================================================================================

def generate_table_a3(df, output_path='table_A3_correlation_matrix.csv'):
    """
    Generate Table A3: Correlation Matrix Between Dependent and Independent Variables
    
    Dependent Variable: duration1
    Independent Variables: All other variables
    """
    
    dep_vars = ['duration1']
    indep_vars = ['intraconflict', 'aitems', 'bitems', 'hetero_int', 'int_sup_num', 
                  'int_opp_num', 'int_sup_per', 'int_opp_per', 'rich_opp_num', 
                  'poor_oop_num', 'rich_opp_per', 'poor_oop_per', 'interconflict', 
                  'epconflict', 'saliency', 'con_duration', 'complexity', 'turnover', 
                  'covid', 'presidency']
    
    # Map column names to user's naming convention
    rename_map = {
        'duration1': 'duration1',
        'intraconflict': 'intraconflict',
        'aitems': 'aitems',
        'bitems': 'bitems',
        'hetero_int': 'hetero.int',
        'int_sup_num': 'int.sup.num',
        'int_opp_num': 'int.opp.num',
        'int_sup_per': 'int.sup.per',
        'int_opp_per': 'int.opp.per',
        'rich_opp_num': 'rich.opp.num',
        'poor_oop_num': 'poor.oop.num',
        'rich_opp_per': 'rich.opp.per',
        'poor_oop_per': 'poor.oop.per',
        'interconflict': 'interconflict',
        'epconflict': 'epconflict',
        'saliency': 'saliency',
        'con_duration': 'con.duration',
        'complexity': 'complexity',
        'turnover': 'turnover',
        'covid': 'covid',
        'presidency': 'presidency'
    }
    
    cols = dep_vars + indep_vars
    corr_matrix = df[cols].corr()
    
    # Round to 3 decimal places
    corr_matrix = corr_matrix.round(3)
    
    # Rename columns and index
    corr_matrix.index = [rename_map.get(c, c) for c in corr_matrix.index]
    corr_matrix.columns = [rename_map.get(c, c) for c in corr_matrix.columns]
    
    corr_matrix.to_csv(output_path)
    print(f"Table A3 saved to {output_path}")
    return corr_matrix


# ================================================================================
# TABLE A4: ALTERNATIVE MODEL SPECIFICATIONS
# ================================================================================

def generate_table_a4(df, output_path='table_A4_alternative_models.csv'):
    """
    Generate Table A4: Alternative Model Specifications (Robustness Checks)
    
    Models:
    - Model A4-1: Using duration2 as dependent variable
    - Model A4-2: Reduced controls (dropped turnover, covid, presidency)
    - Model A4-3: Log-transformed duration1
    """
    
    indep = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per', 
             'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict', 
             'saliency', 'con_duration', 'complexity', 'turnover', 'covid', 'presidency']
    
    X = sm.add_constant(df[indep])
    
    # Model A4-1: duration2 as DV
    y2 = df['duration2']
    model_a4_1 = sm.OLS(y2, X).fit()
    
    # Model A4-2: Reduced controls
    indep_reduced = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per', 
                     'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict', 
                     'saliency', 'con_duration', 'complexity']
    X2 = sm.add_constant(df[indep_reduced])
    model_a4_2 = sm.OLS(df['duration1'], X2).fit()
    
    # Model A4-3: Log-transformed DV
    y_log = np.log(df['duration1'] + 1)
    model_a4_3 = sm.OLS(y_log, X).fit()
    
    def sig_stars(pval):
        if pval < 0.001:
            return '***'
        elif pval < 0.01:
            return '**'
        elif pval < 0.05:
            return '*'
        else:
            return ''
    
    vars_order = ['const'] + indep
    
    rows = []
    for var in vars_order:
        # Model 1
        coef1 = model_a4_1.params[var]
        se1 = model_a4_1.bse[var]
        p1 = model_a4_1.pvalues[var]
        stars1 = sig_stars(p1)
        
        # Model 2
        coef2_val = model_a4_2.params.get(var, np.nan)
        se2_val = model_a4_2.bse.get(var, np.nan)
        p2 = model_a4_2.pvalues.get(var, np.nan)
        stars2 = sig_stars(p2) if not np.isnan(p2) else ''
        
        # Model 3
        coef3 = model_a4_3.params[var]
        se3 = model_a4_3.bse[var]
        p3 = model_a4_3.pvalues[var]
        stars3 = sig_stars(p3)
        
        v1 = f'{coef1:.4f}{stars1}'
        v1_se = f'({se1:.4f})'
        
        if not np.isnan(coef2_val):
            v2 = f'{coef2_val:.4f}{stars2}'
            v2_se = f'({se2_val:.4f})'
        else:
            v2 = ''
            v2_se = ''
        
        v3 = f'{coef3:.4f}{stars3}'
        v3_se = f'({se3:.4f})'
        
        rows.append([var, v1, v1_se, v2, v2_se, v3, v3_se])
    
    rows.append(['N', f'{int(model_a4_1.nobs)}', '', f'{int(model_a4_2.nobs)}', '', f'{int(model_a4_3.nobs)}', ''])
    rows.append(['R2', f'{model_a4_1.rsquared:.4f}', '', f'{model_a4_2.rsquared:.4f}', '', f'{model_a4_3.rsquared:.4f}', ''])
    rows.append(['Adj R2', f'{model_a4_1.rsquared_adj:.4f}', '', f'{model_a4_2.rsquared_adj:.4f}', '', f'{model_a4_3.rsquared_adj:.4f}', ''])
    
    result_df = pd.DataFrame(rows, columns=['Variable', 'Model A4-1', 'SE', 'Model A4-2', 'SE', 'Model A4-3', 'SE'])
    result_df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"Table A4 saved to {output_path}")
    return result_df


# ================================================================================
# FIGURE 2: DISTRIBUTION OF LEGISLATIVE DURATION
# ================================================================================

def generate_figure2(df, output_path='figure2_distribution.png'):
    """
    Generate Figure 2: Distribution of Legislative Duration
    
    Left panel: Histogram of legislative duration (in months)
    Right panel: Bar chart of duration categories
    """
    
    df['duration_months'] = df['duration1'] / 30
    
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Left: Histogram
    ax1 = axes[0]
    ax1.hist(df['duration_months'], bins=30, color='#2E86AB', edgecolor='black', 
             linewidth=0.5, alpha=0.7)
    ax1.axvline(df['duration_months'].mean(), color='red', linestyle='--', 
                linewidth=2, label=f"Mean = {df['duration_months'].mean():.2f}")
    ax1.axvline(df['duration_months'].median(), color='green', linestyle='--', 
                linewidth=2, label=f"Median = {df['duration_months'].median():.2f}")
    ax1.set_xlabel('Legislative Duration (months)', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Frequency', fontsize=12, fontweight='bold')
    ax1.set_title('Distribution of Legislative Duration', fontsize=13, fontweight='bold')
    ax1.legend(loc='upper right', frameon=True)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.grid(axis='y', alpha=0.3, linestyle='--')
    
    # Right: Categorical bars
    ax2 = axes[1]
    duration_bins = pd.cut(df['duration_months'], bins=[0, 1, 2, 3, 6, 12, float('inf')], 
                           labels=['<1 month', '1-2 months', '2-3 months', '3-6 months', 
                                   '6-12 months', '>12 months'])
    counts = duration_bins.value_counts().sort_index()
    colors = ['#2E86AB', '#A23B72', '#F18F01', '#C73E1D', '#3B1F2B', '#95C8D8']
    bars = ax2.bar(range(len(counts)), counts.values, color=colors, edgecolor='black', linewidth=0.5)
    ax2.set_xticks(range(len(counts)))
    ax2.set_xticklabels(counts.index, rotation=45, ha='right', fontsize=10)
    ax2.set_xlabel('Duration Category', fontsize=12, fontweight='bold')
    ax2.set_ylabel('Number of Legislative Processes', fontsize=12, fontweight='bold')
    ax2.set_title('Legislative Duration by Category', fontsize=13, fontweight='bold')
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.grid(axis='y', alpha=0.3, linestyle='--')
    
    for bar, val in zip(bars, counts.values):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2, str(val), 
                 ha='center', va='bottom', fontsize=10, fontweight='bold')
    
    plt.suptitle('Figure 2: Distribution of Legislative Duration', 
                 fontsize=15, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"Figure 2 saved to {output_path}")


# ================================================================================
# FIGURE 3: INTERACTION EFFECT - H1
# ================================================================================

def generate_figure3(df, output_path='figure3.png'):
    """
    Generate Figure 3: H1 - Interaction Effect (intraconflict * hetero_int)
    
    Left panel: Interaction plot with predicted values
    Right panel: Coefficient plot with confidence intervals
    """
    
    indep = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per', 
             'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict', 
             'saliency', 'con_duration', 'complexity', 'turnover', 'covid', 'presidency']
    
    df['intraconflict:hetero_int'] = df['intraconflict'] * df['hetero_int']
    indep_h1 = indep + ['intraconflict:hetero_int']
    
    X_h1 = sm.add_constant(df[indep_h1])
    y = df['duration1']
    model_h1 = sm.OLS(y, X_h1).fit()
    
    mean_vals = df[indep].mean()
    
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    
    # Left: Interaction plot
    ax1 = axes[0]
    for idx, intraconflict_val in enumerate([0, 1]):
        x_pred = np.linspace(df['hetero_int'].min(), df['hetero_int'].max(), 100)
        pred_data = pd.DataFrame({'hetero_int': x_pred, 'intraconflict': intraconflict_val})
        for col in indep:
            if col not in ['hetero_int', 'intraconflict']:
                pred_data[col] = mean_vals[col]
        pred_data['intraconflict:hetero_int'] = pred_data['intraconflict'] * pred_data['hetero_int']
        pred_data = sm.add_constant(pred_data[indep_h1], has_constant='add')
        predictions = model_h1.get_prediction(pred_data)
        pred_summary = predictions.summary_frame(alpha=0.05)
        label = 'Intraconflict = 1' if intraconflict_val == 1 else 'Intraconflict = 0'
        ax1.plot(x_pred, pred_summary['mean'], linewidth=2.5, label=label, 
                 color='#2166ac' if intraconflict_val == 0 else '#b2182b')
        ax1.fill_between(x_pred, pred_summary['mean_ci_lower'], pred_summary['mean_ci_upper'], 
                         alpha=0.2, color='#2166ac' if intraconflict_val == 0 else '#b2182b')
    
    ax1.set_xlabel('Interest Groups Heterogeneity', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Predicted Legislative Duration', fontsize=12, fontweight='bold')
    ax1.set_title('Figure 3: H1 - Intraconflict x Heterogeneity\n(Interaction Effect)', 
                  fontsize=13, fontweight='bold')
    ax1.legend(loc='best', framealpha=0.9)
    ax1.grid(True, alpha=0.3)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    
    # Right: Coefficient plot
    ax2 = axes[1]
    vars_h1 = ['intraconflict', 'hetero_int', 'intraconflict:hetero_int']
    vars_h1_labels = ['intra-institutional conflicts', 'heterogeneity (IG)', 
                      'intra-institutional conflicts x heterogeneity (IG)']
    coefs_h1 = [model_h1.params[v] for v in vars_h1]
    pvals_h1 = [model_h1.pvalues[v] for v in vars_h1]
    errors_h1 = [model_h1.bse[v] * 1.96 for v in vars_h1]
    y_pos = np.arange(len(vars_h1))
    colors_bar = ['#2166ac' if c > 0 else '#b2182b' for c in coefs_h1]
    
    ax2.barh(y_pos, coefs_h1, xerr=errors_h1, color=colors_bar, alpha=0.7, 
             capsize=5, edgecolor='black', linewidth=0.5)
    ax2.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
    ax2.set_yticks(y_pos)
    ax2.set_yticklabels(vars_h1_labels, fontsize=10)
    ax2.set_xlabel('Coefficient (with 95% CI)', fontsize=12, fontweight='bold')
    ax2.set_title('H1 Key Coefficients', fontsize=13, fontweight='bold')
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    
    for i, (c, p) in enumerate(zip(coefs_h1, pvals_h1)):
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ''
        ax2.annotate(f'{c:.3f}{sig}', xy=(c + errors_h1[i] + 0.02, i), fontsize=10)
    
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"Figure 3 saved to {output_path}")


# ================================================================================
# FIGURE 4: INTERACTION EFFECT - H2
# ================================================================================

def generate_figure4(df, output_path='figure4.png'):
    """
    Generate Figure 4: H2 - Interaction Effects
    
    Interaction between member state homogeneity (aitems) and interest groups support
    """
    
    indep = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per', 
             'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict', 
             'saliency', 'con_duration', 'complexity', 'turnover', 'covid', 'presidency']
    
    df['aitems:int_sup_per'] = df['aitems'] * df['int_sup_per']
    indep_h2 = indep + ['aitems:int_sup_per']
    
    X_h2 = sm.add_constant(df[indep_h2])
    y = df['duration1']
    model_h2 = sm.OLS(y, X_h2).fit()
    
    mean_vals = df[indep].mean()
    
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    
    ax1 = axes[0]
    for idx, aitems_val in enumerate([0, 1]):
        x_pred = np.linspace(df['int_sup_per'].min(), df['int_sup_per'].max(), 100)
        pred_data = pd.DataFrame({'int_sup_per': x_pred, 'aitems': aitems_val})
        for col in indep:
            if col not in ['int_sup_per', 'aitems']:
                pred_data[col] = mean_vals[col]
        pred_data['aitems:int_sup_per'] = pred_data['aitems'] * pred_data['int_sup_per']
        pred_data = sm.add_constant(pred_data[indep_h2], has_constant='add')
        predictions = model_h2.get_prediction(pred_data)
        pred_summary = predictions.summary_frame(alpha=0.05)
        label = 'Member state homogeneity = 1' if aitems_val == 1 else 'Member state homogeneity = 0'
        ax1.plot(x_pred, pred_summary['mean'], linewidth=2.5, label=label,
                 color='#2166ac' if aitems_val == 0 else '#b2182b')
        ax1.fill_between(x_pred, pred_summary['mean_ci_lower'], pred_summary['mean_ci_upper'],
                         alpha=0.2, color='#2166ac' if aitems_val == 0 else '#b2182b')
    
    ax1.set_xlabel('Interest Groups Support', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Predicted Legislative Duration', fontsize=12, fontweight='bold')
    ax1.set_title('Figure 4: H2 - Member State Homogeneity x Interest Groups Support\n(Interaction Effect)',
                  fontsize=13, fontweight='bold')
    ax1.legend(loc='best', framealpha=0.9)
    ax1.grid(True, alpha=0.3)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    
    # Right: Coefficient plot
    ax2 = axes[1]
    vars_h2 = ['aitems', 'int_sup_per', 'aitems:int_sup_per']
    vars_h2_labels = ['member state homogeneity', 'interest groups support',
                      'member state homogeneity x interest groups support']
    coefs_h2 = [model_h2.params[v] for v in vars_h2]
    pvals_h2 = [model_h2.pvalues[v] for v in vars_h2]
    errors_h2 = [model_h2.bse[v] * 1.96 for v in vars_h2]
    y_pos = np.arange(len(vars_h2))
    colors_bar = ['#2166ac' if c > 0 else '#b2182b' for c in coefs_h2]
    
    ax2.barh(y_pos, coefs_h2, xerr=errors_h2, color=colors_bar, alpha=0.7,
             capsize=5, edgecolor='black', linewidth=0.5)
    ax2.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
    ax2.set_yticks(y_pos)
    ax2.set_yticklabels(vars_h2_labels, fontsize=10)
    ax2.set_xlabel('Coefficient (with 95% CI)', fontsize=12, fontweight='bold')
    ax2.set_title('H2 Key Coefficients', fontsize=13, fontweight='bold')
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    
    for i, (c, p) in enumerate(zip(coefs_h2, pvals_h2)):
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ''
        ax2.annotate(f'{c:.3f}{sig}', xy=(c + errors_h2[i] + 0.02, i), fontsize=10)
    
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"Figure 4 saved to {output_path}")


# ================================================================================
# FIGURE 5: INTERACTION EFFECT - H3
# ================================================================================

def generate_figure5(df, output_path='figure5.png'):
    """
    Generate Figure 5: H3 - Interaction Effects
    
    Interaction between member state heterogeneity (bitems) and resource-rich groups opposition
    """
    
    indep = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per',
             'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict',
             'saliency', 'con_duration', 'complexity', 'turnover', 'covid', 'presidency']
    
    df['bitems:rich_opp_per'] = df['bitems'] * df['rich_opp_per']
    indep_h3 = indep + ['bitems:rich_opp_per']
    
    X_h3 = sm.add_constant(df[indep_h3])
    y = df['duration1']
    model_h3 = sm.OLS(y, X_h3).fit()
    
    mean_vals = df[indep].mean()
    
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    
    ax1 = axes[0]
    for idx, bitems_val in enumerate([0, 1]):
        x_pred = np.linspace(df['rich_opp_per'].min(), df['rich_opp_per'].max(), 100)
        pred_data = pd.DataFrame({'rich_opp_per': x_pred, 'bitems': bitems_val})
        for col in indep:
            if col not in ['rich_opp_per', 'bitems']:
                pred_data[col] = mean_vals[col]
        pred_data['bitems:rich_opp_per'] = pred_data['bitems'] * pred_data['rich_opp_per']
        pred_data = sm.add_constant(pred_data[indep_h3], has_constant='add')
        predictions = model_h3.get_prediction(pred_data)
        pred_summary = predictions.summary_frame(alpha=0.05)
        label = 'Member state heterogeneity = 1' if bitems_val == 1 else 'Member state heterogeneity = 0'
        ax1.plot(x_pred, pred_summary['mean'], linewidth=2.5, label=label,
                 color='#2166ac' if bitems_val == 0 else '#b2182b')
        ax1.fill_between(x_pred, pred_summary['mean_ci_lower'], pred_summary['mean_ci_upper'],
                         alpha=0.2, color='#2166ac' if bitems_val == 0 else '#b2182b')
    
    ax1.set_xlabel('Resource-rich Groups Opposition', fontsize=12, fontweight='bold')
    ax1.set_ylabel('Predicted Legislative Duration', fontsize=12, fontweight='bold')
    ax1.set_title('Figure 5: H3 - Member State Heterogeneity x Resource-rich Groups Opposition\n(Interaction Effect)',
                  fontsize=13, fontweight='bold')
    ax1.legend(loc='best', framealpha=0.9)
    ax1.grid(True, alpha=0.3)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    
    # Right: Coefficient plot
    ax2 = axes[1]
    vars_h3 = ['bitems', 'rich_opp_per', 'bitems:rich_opp_per']
    vars_h3_labels = ['member state heterogeneity', 'resource-rich groups opposition',
                      'member state heterogeneity x resource-rich groups opposition']
    coefs_h3 = [model_h3.params[v] for v in vars_h3]
    pvals_h3 = [model_h3.pvalues[v] for v in vars_h3]
    errors_h3 = [model_h3.bse[v] * 1.96 for v in vars_h3]
    y_pos = np.arange(len(vars_h3))
    colors_bar = ['#2166ac' if c > 0 else '#b2182b' for c in coefs_h3]
    
    ax2.barh(y_pos, coefs_h3, xerr=errors_h3, color=colors_bar, alpha=0.7,
             capsize=5, edgecolor='black', linewidth=0.5)
    ax2.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
    ax2.set_yticks(y_pos)
    ax2.set_yticklabels(vars_h3_labels, fontsize=10)
    ax2.set_xlabel('Coefficient (with 95% CI)', fontsize=12, fontweight='bold')
    ax2.set_title('H3 Key Coefficients', fontsize=13, fontweight='bold')
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    
    for i, (c, p) in enumerate(zip(coefs_h3, pvals_h3)):
        sig = '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else ''
        ax2.annotate(f'{c:.3f}{sig}', xy=(c + errors_h3[i] + 0.02, i), fontsize=10)
    
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"Figure 5 saved to {output_path}")


# ================================================================================
# FIGURE A1: COEFFICIENT COMPARISON ACROSS ROBUSTNESS MODELS
# ================================================================================

def generate_figure_a1(df, output_path='figure_A1_coefficient_comparison.png'):
    """
    Generate Figure A1: Coefficient Comparison Across Robustness Models
    """
    
    indep = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per', 
             'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict', 
             'saliency', 'con_duration', 'complexity', 'turnover', 'covid', 'presidency']
    
    X = sm.add_constant(df[indep])
    y2 = df['duration2']
    model_a4_1 = sm.OLS(y2, X).fit()
    
    indep_reduced = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per', 
                     'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict', 
                     'saliency', 'con_duration', 'complexity']
    X2 = sm.add_constant(df[indep_reduced])
    model_a4_2 = sm.OLS(df['duration1'], X2).fit()
    
    y_log = np.log(df['duration1'] + 1)
    model_a4_3 = sm.OLS(y_log, X).fit()
    
    key_vars = ['intraconflict', 'hetero_int', 'aitems', 'int_sup_per', 
                'epconflict', 'saliency', 'con_duration', 'complexity']
    
    var_labels = {
        'intraconflict': 'Intra-institutional\nconflicts',
        'hetero_int': 'Interest groups\nheterogeneity',
        'aitems': 'Member state\nhomogeneity',
        'int_sup_per': 'Interest groups\nsupport',
        'epconflict': 'Conflicts within\nthe EP',
        'saliency': 'Legislative\nsaliency',
        'con_duration': 'Consultation\nduration',
        'complexity': 'Legislative\ncomplexity'
    }
    
    fig, ax = plt.subplots(figsize=(12, 7))
    
    x = np.arange(len(key_vars))
    width = 0.25
    
    coef1 = [model_a4_1.params[v] for v in key_vars]
    coef2 = [model_a4_2.params[v] for v in key_vars]
    coef3 = [model_a4_3.params[v] for v in key_vars]
    
    colors = ['#2E86AB', '#A23B72', '#F18F01']
    
    bars1 = ax.bar(x - width, coef1, width, label='Model A: duration in days', 
                   color=colors[0], edgecolor='black', linewidth=0.5)
    bars2 = ax.bar(x, coef2, width, label='Model B: Reduced', 
                   color=colors[1], edgecolor='black', linewidth=0.5)
    bars3 = ax.bar(x + width, coef3, width, label='Model C: log duration in months', 
                   color=colors[2], edgecolor='black', linewidth=0.5)
    
    ax.set_xlabel('Independent Variables', fontsize=12, fontweight='bold')
    ax.set_ylabel('Coefficient Estimate', fontsize=12, fontweight='bold')
    ax.set_title('Figure A1: Coefficient Comparison Across Robustness Models', 
                 fontsize=13, fontweight='bold', pad=15)
    ax.set_xticks(x)
    ax.set_xticklabels([var_labels[v] for v in key_vars], fontsize=9)
    ax.legend(loc='upper right', frameon=True, fancybox=False, edgecolor='black')
    ax.axhline(y=0, color='black', linestyle='-', linewidth=0.8)
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"Figure A1 saved to {output_path}")


# ================================================================================
# FIGURE A2: R-SQUARED COMPARISON
# ================================================================================

def generate_figure_a2(df, output_path='figure_A2_r_squared_comparison.png'):
    """
    Generate Figure A2: Model Fit Comparison (R-squared)
    """
    
    indep = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per', 
             'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict', 
             'saliency', 'con_duration', 'complexity', 'turnover', 'covid', 'presidency']
    
    X = sm.add_constant(df[indep])
    y2 = df['duration2']
    model_a4_1 = sm.OLS(y2, X).fit()
    
    indep_reduced = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per', 
                     'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict', 
                     'saliency', 'con_duration', 'complexity']
    X2 = sm.add_constant(df[indep_reduced])
    model_a4_2 = sm.OLS(df['duration1'], X2).fit()
    
    y_log = np.log(df['duration1'] + 1)
    model_a4_3 = sm.OLS(y_log, X).fit()
    
    fig, ax = plt.subplots(figsize=(8, 6))
    
    models = ['Model A\n(duration in days)', 'Model B\n(Reduced)', 'Model C\n(log duration)']
    r2 = [model_a4_1.rsquared, model_a4_2.rsquared, model_a4_3.rsquared]
    adj_r2 = [model_a4_1.rsquared_adj, model_a4_2.rsquared_adj, model_a4_3.rsquared_adj]
    
    x = np.arange(len(models))
    width = 0.35
    
    colors = ['#2E86AB', '#A23B72']
    bars1 = ax.bar(x - width/2, r2, width, label='R-squared', 
                   color=colors[0], edgecolor='black', linewidth=0.5)
    bars2 = ax.bar(x + width/2, adj_r2, width, label='Adj R-squared', 
                   color=colors[1], edgecolor='black', linewidth=0.5)
    
    ax.set_ylabel('Value', fontsize=12, fontweight='bold')
    ax.set_title('Figure A2: Model Fit Comparison', fontsize=13, fontweight='bold', pad=15)
    ax.set_xticks(x)
    ax.set_xticklabels(models, fontsize=10)
    ax.legend(loc='upper right', frameon=True)
    ax.set_ylim(0, 1)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    
    for bar, val in zip(bars1, r2):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, f'{val:.3f}', 
                ha='center', va='bottom', fontsize=10, fontweight='bold')
    for bar, val in zip(bars2, adj_r2):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, f'{val:.3f}', 
                ha='center', va='bottom', fontsize=10, fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"Figure A2 saved to {output_path}")


# ================================================================================
# FIGURE A3: COEFFICIENT PLOT WITH CONFIDENCE INTERVALS
# ================================================================================

def generate_figure_a3(df, output_path='figure_A3_coefficient_plot.png'):
    """
    Generate Figure A3: Coefficient Plot with 95% Confidence Intervals
    """
    
    indep = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per', 
             'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict', 
             'saliency', 'con_duration', 'complexity', 'turnover', 'covid', 'presidency']
    
    X = sm.add_constant(df[indep])
    y_log = np.log(df['duration1'] + 1)
    model_a4_3 = sm.OLS(y_log, X).fit()
    
    var_labels = {
        'intraconflict': 'Intra-institutional conflicts',
        'hetero_int': 'Interest groups heterogeneity',
        'aitems': 'Member state homogeneity',
        'bitems': 'Member state heterogeneity',
        'int_sup_per': 'Interest groups support',
        'rich_opp_per': 'Resource-rich groups opposition',
        'poor_oop_per': 'Resource-poor groups opposition',
        'interconflict': 'Inter-institutional conflicts',
        'epconflict': 'Conflicts within the EP',
        'saliency': 'Legislative saliency',
        'con_duration': 'Consultation duration',
        'complexity': 'Legislative complexity',
        'turnover': 'Institutional turnover',
        'covid': 'COVID-19',
        'presidency': 'Council presidency'
    }
    
    fig, ax = plt.subplots(figsize=(10, 8))
    
    vars_plot = indep
    y_pos = np.arange(len(vars_plot))
    
    conf = model_a4_3.conf_int()
    coef = [model_a4_3.params[v] for v in vars_plot]
    ci_lower = [conf.loc[v, 0] for v in vars_plot]
    ci_upper = [conf.loc[v, 1] for v in vars_plot]
    
    colors = ['#2E86AB' if model_a4_3.pvalues[v] < 0.05 else '#999999' for v in vars_plot]
    
    for i, (c, cl, cu, color) in enumerate(zip(coef, ci_lower, ci_upper, colors)):
        ax.plot([cl, cu], [i, i], color=color, linewidth=1.5)
        ax.scatter(c, i, color=color, s=80, zorder=5, edgecolor='black', linewidth=0.5)
    
    ax.axvline(x=0, color='red', linestyle='--', linewidth=1)
    ax.set_yticks(y_pos)
    ax.set_yticklabels([var_labels[v] for v in vars_plot], fontsize=10)
    ax.set_xlabel('Coefficient Estimate', fontsize=12, fontweight='bold')
    ax.set_title('Figure A3: Coefficient Plot with 95% Confidence Interval\n(Model C: log-transformed duration)', 
                 fontsize=13, fontweight='bold', pad=15)
    ax.grid(axis='x', alpha=0.3, linestyle='--')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"Figure A3 saved to {output_path}")


# ================================================================================
# FIGURE A4: SIGNIFICANCE HEATMAP
# ================================================================================

def generate_figure_a4(df, output_path='figure_A4_significance_heatmap.png'):
    """
    Generate Figure A4: Statistical Significance Across Robustness Models
    """
    
    indep = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per', 
             'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict', 
             'saliency', 'con_duration', 'complexity', 'turnover', 'covid', 'presidency']
    
    X = sm.add_constant(df[indep])
    y2 = df['duration2']
    model_a4_1 = sm.OLS(y2, X).fit()
    
    indep_reduced = ['intraconflict', 'hetero_int', 'aitems', 'bitems', 'int_sup_per', 
                     'rich_opp_per', 'poor_oop_per', 'interconflict', 'epconflict', 
                     'saliency', 'con_duration', 'complexity']
    X2 = sm.add_constant(df[indep_reduced])
    model_a4_2 = sm.OLS(df['duration1'], X2).fit()
    
    y_log = np.log(df['duration1'] + 1)
    model_a4_3 = sm.OLS(y_log, X).fit()
    
    var_labels = {
        'intraconflict': 'Intra-institutional conflicts',
        'hetero_int': 'Interest groups heterogeneity',
        'aitems': 'Member state homogeneity',
        'bitems': 'Member state heterogeneity',
        'int_sup_per': 'Interest groups support',
        'rich_opp_per': 'Resource-rich groups opposition',
        'poor_oop_per': 'Resource-poor groups opposition',
        'interconflict': 'Inter-institutional conflicts',
        'epconflict': 'Conflicts within the EP',
        'saliency': 'Legislative saliency',
        'con_duration': 'Consultation duration',
        'complexity': 'Legislative complexity',
        'turnover': 'Institutional turnover',
        'covid': 'COVID-19',
        'presidency': 'Council presidency'
    }
    
    vars_all = indep
    
    sig_matrix = []
    for v in vars_all:
        p1 = model_a4_1.pvalues.get(v, 1)
        p2 = model_a4_2.pvalues.get(v, 1)
        p3 = model_a4_3.pvalues.get(v, 1)
        
        s1 = 1 if p1 < 0.001 else (0.8 if p1 < 0.01 else (0.6 if p1 < 0.05 else 0))
        s2 = 1 if p2 < 0.001 else (0.8 if p2 < 0.01 else (0.6 if p2 < 0.05 else 0))
        s3 = 1 if p3 < 0.001 else (0.8 if p3 < 0.01 else (0.6 if p3 < 0.05 else 0))
        sig_matrix.append([s1, s2, s3])
    
    sig_matrix = np.array(sig_matrix)
    
    fig, ax = plt.subplots(figsize=(10, 10))
    
    im = ax.imshow(sig_matrix, cmap='Blues', aspect='auto', vmin=0, vmax=1)
    
    ax.set_xticks([0, 1, 2])
    ax.set_xticklabels(['Model A\n(duration in days)', 'Model B\n(Reduced)', 'Model C\n(log duration)'], fontsize=11)
    ax.set_yticks(range(len(vars_all)))
    ax.set_yticklabels([var_labels[v] for v in vars_all], fontsize=10)
    ax.set_title('Figure A4: Statistical Significance Across Robustness Models\n(darker = more significant)', 
                 fontsize=13, fontweight='bold', pad=15)
    
    for i in range(len(vars_all)):
        for j in range(3):
            if sig_matrix[i, j] > 0:
                stars = '***' if sig_matrix[i, j] == 1 else ('**' if sig_matrix[i, j] == 0.8 else '*')
                ax.text(j, i, stars, ha='center', va='center', fontsize=9, 
                       color='white' if sig_matrix[i,j] > 0.5 else 'black')
    
    cbar = plt.colorbar(im, ax=ax, shrink=0.8)
    cbar.set_ticks([0, 0.6, 0.8, 1])
    cbar.set_ticklabels(['Not significant', '* p<0.05', '** p<0.01', '*** p<0.001'])
    cbar.ax.tick_params(labelsize=9)
    
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"Figure A4 saved to {output_path}")


# ================================================================================
# MAIN EXECUTION
# ================================================================================

def main():
    """
    Main function to generate all tables and figures
    """
    print("="*70)
    print("REPLICATION CODE FOR LEGISLATIVE DURATION ANALYSIS")
    print("="*70)
    
    # Load data
    print("\nLoading data...")
    df = load_data('analysis_data.xlsx')
    print(f"Data loaded: {len(df)} observations")
    
    # Generate Tables
    print("\n" + "="*70)
    print("GENERATING TABLES")
    print("="*70)
    
    generate_table1(df)
    generate_table2(df)
    generate_table3(df)
    generate_table_a3(df)
    generate_table_a4(df)
    
    # Generate Figures
    print("\n" + "="*70)
    print("GENERATING FIGURES")
    print("="*70)
    
    generate_figure2(df)
    generate_figure3(df)
    generate_figure4(df)
    generate_figure5(df)
    generate_figure_a1(df)
    generate_figure_a2(df)
    generate_figure_a3(df)
    generate_figure_a4(df)
    
    print("\n" + "="*70)
    print("ALL TABLES AND FIGURES GENERATED SUCCESSFULLY")
    print("="*70)


if __name__ == "__main__":
    main()
